
library(seqinr)
library(stringr)
library(dplyr)
library(epiDisplay)
library(gmodels)
library(ggplot2)


# Read the three GSB input tables (TPP, MQ and PD) from one shared folder.
# The folder is named once here; edit `data_dir` to run the script against a
# copy of the data stored somewhere else.
data_dir <- "D:/Pipeline comparisons/Writing/Data/GSB/Plasmodium"

TPP_GSB <- read.csv(file.path(data_dir, "TPP_GSB.csv"))
MQ_GSB <- read.csv(file.path(data_dir, "MQ_GSB.csv"))
PD_GSB <- read.csv(file.path(data_dir, "PD_GSB.csv"))


# Reduce each input table to the merge key (PROTEIN_LOC) plus its `cat` and
# `Amino` columns. select() stays namespace-qualified so that a select() from
# another attached package cannot be picked up instead.
R_TPP_GSB <- TPP_GSB %>%
  dplyr::select("PROTEIN_LOC", "cat", "Amino")
R_MQ_GSB <- MQ_GSB %>%
  dplyr::select("PROTEIN_LOC", "cat", "Amino")
R_PD_GSB <- PD_GSB %>%
  dplyr::select("PROTEIN_LOC", "cat", "Amino")


# Prefix the `cat` and `Amino` columns of each table with the name of the
# table they came from, so the columns stay distinguishable once the tables
# are merged on PROTEIN_LOC.
#
# rename() is namespace-qualified on purpose: epiDisplay is attached after
# dplyr and exports its own rename() generic, so an unqualified call would
# dispatch to epiDisplay's version instead of dplyr's.
R_TPP_GSB <- R_TPP_GSB %>%
  dplyr::rename(
    TPP_cat = cat,
    TPP_amino = Amino
  )
R_MQ_GSB <- R_MQ_GSB %>%
  dplyr::rename(
    MQ_cat = cat,
    MQ_amino = Amino
  )
R_PD_GSB <- R_PD_GSB %>%
  dplyr::rename(
    PD_cat = cat,
    PD_amino = Amino
  )


# Full outer joins on PROTEIN_LOC: a key present in any of the three tables
# gets a row, with NA in the columns of every table that lacks it.
# NOTE(review): if PROTEIN_LOC repeats within a table, merge() returns every
# pairing of the duplicated rows -- confirm the keys are unique per table.
merged_TPP_PD <- R_TPP_GSB %>%
  merge(R_PD_GSB, by = "PROTEIN_LOC", all = TRUE)
All_merged <- merged_TPP_PD %>%
  merge(R_MQ_GSB, by = "PROTEIN_LOC", all = TRUE)

# TRUE wherever a table contributed a (non-missing) category for the row.
detected <- !is.na(All_merged[c("TPP_cat", "MQ_cat", "PD_cat")])

# How many of the three tables have a category for each row, followed by the
# same count restricted to each pair of tables (0, 1 or 2).
All_merged$`Number of matches` <- rowSums(detected)
All_merged$TPPvsMQ <- rowSums(detected[, c("TPP_cat", "MQ_cat"), drop = FALSE])
All_merged$TPPvsPD <- rowSums(detected[, c("TPP_cat", "PD_cat"), drop = FALSE])
All_merged$MQvsPD <- rowSums(detected[, c("MQ_cat", "PD_cat"), drop = FALSE])

# "Y" when both tables of the pair have a category for the row, otherwise "N".
All_merged$`TPP vs MQ` <- ifelse(All_merged$TPPvsMQ == 2, "Y", "N")
All_merged$`TPP vs PD` <- ifelse(All_merged$TPPvsPD == 2, "Y", "N")
All_merged$`MQ vs PD` <- ifelse(All_merged$MQvsPD == 2, "Y", "N")

# Numeric score per table: "Bronze" -> 1, "Silver" -> 2, any other non-missing
# label -> 3; a missing category stays NA.
# NOTE(review): the fall-through value 3 is presumably meant for a "Gold"
# tier, but any unexpected label is scored 3 as well -- confirm the set of
# labels that can occur in the `cat` columns.
All_merged$TPP_cat_num <- case_when(
  is.na(All_merged$TPP_cat) ~ NA_real_,
  All_merged$TPP_cat == "Bronze" ~ 1,
  All_merged$TPP_cat == "Silver" ~ 2,
  TRUE ~ 3
)
All_merged$MQ_cat_num <- case_when(
  is.na(All_merged$MQ_cat) ~ NA_real_,
  All_merged$MQ_cat == "Bronze" ~ 1,
  All_merged$MQ_cat == "Silver" ~ 2,
  TRUE ~ 3
)
All_merged$PD_cat_num <- case_when(
  is.na(All_merged$PD_cat) ~ NA_real_,
  All_merged$PD_cat == "Bronze" ~ 1,
  All_merged$PD_cat == "Silver" ~ 2,
  TRUE ~ 3
)

# Average the per-table scores of each row, ignoring tables with no score.
All_merged$cat_num <- rowMeans(
  All_merged[c("TPP_cat_num", "MQ_cat_num", "PD_cat_num")],
  na.rm = TRUE
)

# Enlarge axis titles and tick labels; par() settings persist for the
# graphics device, so they also apply to later base-graphics plots.
par(cex.lab = 1.8, cex.axis = 1.8)

# Distribution of the average score, grouped by how many tables matched.
boxplot(
  cat_num ~ `Number of matches`,
  data = All_merged,
  xlab = "Number of matches",
  ylab = "Average nominal score"
)


# Bar chart of how many rows fall into each "Number of matches" group, with
# the row count printed on each bar and the (redundant) fill legend hidden.
ggplot(
  All_merged,
  aes(
    x = as.factor(`Number of matches`),
    fill = as.factor(`Number of matches`)
  )
) +
  geom_bar() +
  # after_stat(count) replaces the deprecated `..count..` notation.
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    vjust = 1,
    size = 6
  ) +
  scale_fill_hue(c = 40) +
  # The x-axis title is set once; the original set the same label twice.
  labs(x = "Number of matches") +
  theme(legend.position = "none", text = element_text(size = 24))
